### LOADING DATA ###
# Switch to the project folder so both CSV files resolve by relative path.
setwd("C:/Users/David/Google Drive/Github/task3-2-wifi-dgibert17")
# Read the training and the validation tables.
df.train <- read.csv(file = "training.csv")
df.val <- read.csv(file = "validation.csv")
# Number of positions at which both data frames carry the same column name.
sum(colnames(df.train) == colnames(df.val))
## [1] 529
Ambos DF tienen los mismos atributos, aunque en el validation no hay registros para SPACEID, RELATIVEPOSITION ni USERID.
The validation fingerprints were taken 3 months later than the training ones; in the meantime some WAPs disappeared and new ones were introduced.
Of the 520 WAPs detected in the UJIIndoorLoc database, 312 were detected in both the training and validation phases, 153 were detected only in the training phase, and 55 new WAPs appeared only in the validation phase.
Como se retiraron WAPs, en el validation set habrá WAPs cuyos valores sean todos 100, porque, al no estar presentes, obviamente no podrán ser registrados por ningún dispositivo.
Por tanto, primero quitaremos los WAPs del validation y del training que sean inútiles; más adelante igualaremos el DF de training con los mismos atributos del validation.
# Names of the WAP columns (column 1 up to "WAP520") whose values in the
# validation set sum to 100 * nrow(df.val). With readings that never exceed
# 100 this means every capture holds 100, i.e. the WAP was never detected.
# Computed in one vectorised pass instead of growing a vector inside a loop;
# the result is character(0) when no column qualifies.
WAPnotDetected.val <- local({
  wap.cols <- seq_len(which(names(df.val) == "WAP520"))
  never.seen <- colSums(df.val[wap.cols]) == 100 * nrow(df.val)
  # which() drops NA comparisons, so a column containing NA is not flagged.
  names(df.val)[wap.cols][which(never.seen)]
})
# Names of the WAP columns (column 1 up to "WAP520") whose values in the
# training set sum to 100 * nrow(df.train). With readings that never exceed
# 100 this means every capture holds 100, i.e. the WAP was never detected.
# Computed in one vectorised pass instead of growing a vector inside a loop;
# the result is character(0) when no column qualifies.
WAPnotDetected.tr <- local({
  wap.cols <- seq_len(which(names(df.train) == "WAP520"))
  never.seen <- colSums(df.train[wap.cols]) == 100 * nrow(df.train)
  # which() drops NA comparisons, so a column containing NA is not flagged.
  names(df.train)[wap.cols][which(never.seen)]
})
# df.WAPnotDetected.tr = df.train[, c(WAPnotDetected.tr, names(df.train[521:529]))]
# cat("El", round(length(WAPnotDetected.val)/520*100),"% de los WAPs no han sido detectados por ningun dispositivo en el training. La universidad esta perdiendo dinero con estos hardware. (Hay que comprobar si ocurre lo mismo para el test).")
# Print the names of the WAP columns flagged as never detected in validation.
WAPnotDetected.val
## [1] "WAP002" "WAP005" "WAP006" "WAP007" "WAP079" "WAP133" "WAP157"
## [8] "WAP163" "WAP193" "WAP194" "WAP197" "WAP198" "WAP199" "WAP200"
## [15] "WAP205" "WAP206" "WAP208" "WAP209" "WAP210" "WAP211" "WAP212"
## [22] "WAP213" "WAP214" "WAP218" "WAP219" "WAP220" "WAP221" "WAP228"
## [29] "WAP230" "WAP231" "WAP235" "WAP250" "WAP251" "WAP252" "WAP291"
## [36] "WAP298" "WAP302" "WAP306" "WAP339" "WAP347" "WAP357" "WAP361"
## [43] "WAP363" "WAP366" "WAP367" "WAP368" "WAP369" "WAP370" "WAP371"
## [50] "WAP372" "WAP373" "WAP374" "WAP375" "WAP376" "WAP377" "WAP378"
## [57] "WAP379" "WAP380" "WAP381" "WAP382" "WAP383" "WAP384" "WAP385"
## [64] "WAP386" "WAP387" "WAP388" "WAP389" "WAP390" "WAP391" "WAP392"
## [71] "WAP393" "WAP394" "WAP395" "WAP396" "WAP397" "WAP398" "WAP399"
## [78] "WAP400" "WAP401" "WAP402" "WAP403" "WAP404" "WAP405" "WAP406"
## [85] "WAP407" "WAP408" "WAP409" "WAP410" "WAP411" "WAP412" "WAP413"
## [92] "WAP414" "WAP415" "WAP417" "WAP420" "WAP421" "WAP424" "WAP425"
## [99] "WAP427" "WAP428" "WAP430" "WAP431" "WAP432" "WAP435" "WAP436"
## [106] "WAP437" "WAP439" "WAP440" "WAP446" "WAP447" "WAP448" "WAP450"
## [113] "WAP453" "WAP454" "WAP455" "WAP457" "WAP459" "WAP460" "WAP461"
## [120] "WAP462" "WAP463" "WAP464" "WAP465" "WAP466" "WAP467" "WAP468"
## [127] "WAP469" "WAP470" "WAP471" "WAP472" "WAP473" "WAP474" "WAP476"
## [134] "WAP477" "WAP479" "WAP480" "WAP490" "WAP503" "WAP504" "WAP505"
## [141] "WAP506" "WAP507" "WAP509" "WAP510" "WAP511" "WAP512" "WAP513"
## [148] "WAP514" "WAP515" "WAP516" "WAP517" "WAP518" "WAP519"
which(names(df.val) %in% WAPnotDetected.val) # These are the indices of the attributes that are not detected in the validation set, so we will remove these attributes from the validation set.
## [1] 2 5 6 7 79 133 157 163 193 194 197 198 199 200 205 206 208
## [18] 209 210 211 212 213 214 218 219 220 221 228 230 231 235 250 251 252
## [35] 291 298 302 306 339 347 357 361 363 366 367 368 369 370 371 372 373
## [52] 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
## [69] 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407
## [86] 408 409 410 411 412 413 414 415 417 420 421 424 425 427 428 430 431
## [103] 432 435 436 437 439 440 446 447 448 450 453 454 455 457 459 460 461
## [120] 462 463 464 465 466 467 468 469 470 471 472 473 474 476 477 479 480
## [137] 490 503 504 505 506 507 509 510 511 512 513 514 515 516 517 518 519
# Drop the never-detected WAP columns from each set. A logical mask replaces
# `-which(...)`: when nothing matches, `-which(...)` is an empty index and
# would select zero columns instead of keeping all of them.
df.val <- df.val[, !(names(df.val) %in% WAPnotDetected.val), drop = FALSE]
df.train <- df.train[, !(names(df.train) %in% WAPnotDetected.tr), drop = FALSE]

# Keep only the columns present in both sets (each set keeps its own order).
tr.in.val.idxs <- which(names(df.train) %in% names(df.val))
df.train <- df.train[, tr.in.val.idxs, drop = FALSE]
val.in.tr.idxs <- which(names(df.val) %in% names(df.train))
df.val <- df.val[, val.in.tr.idxs, drop = FALSE]

# Number of positions at which the column names of both sets now agree.
sum(names(df.train) == names(df.val))
## [1] 321
# intersect(x = names(df.train), y = names(df.val))
# For every training capture, count the WAP columns holding a value other than
# 100 (100 is the value a WAP column holds when that WAP was not detected).
# The WAP block is column 1 up to the last column whose name contains "WAP".
# One vectorised pass replaces the row-by-row loop, which re-ran grep() and
# grew the result vectors on every iteration.
nWAPdetected.tr <- rowSums(
  df.train[, seq_len(max(grep(pattern = "WAP", names(df.train)))), drop = FALSE] != 100
)

# Row labels (as numbers) of the captures with no WAP detected. The loop
# tested "row sum == 100 * number of WAP columns"; a zero count is the same
# condition as long as no reading exceeds 100.
# An empty result is numeric(0) rather than NULL.
DEVICEnotDetected.tr <- as.numeric(rownames(df.train))[which(nWAPdetected.tr == 0)]

# Row labels (as numbers) of the captures with exactly one WAP detected.
DEVICEdetected1WAP.tr <- as.numeric(rownames(df.train))[which(nWAPdetected.tr == 1)]
# names(df.train[first(grep("[^WAP0-9]", names(df.train)))])
# Non-WAP columns: everything after the last column whose name contains "WAP".
# Derived from the names instead of the hard-coded position 313, so it stays
# correct if the number of retained WAP columns changes.
meta.cols.tr <- (max(grep(pattern = "WAP", names(df.train))) + 1):ncol(df.train)

# Non-WAP columns of the flagged captures, kept aside for inspection.
df.DEVICESnotDetected.tr <- df.train[DEVICEnotDetected.tr, meta.cols.tr]
df.DEVICEdetected1WAP.tr <- df.train[DEVICEdetected1WAP.tr, meta.cols.tr]

# Rows to remove: captures with zero or exactly one detected WAP.
DEVICESrm.tr <- sort(c(DEVICEnotDetected.tr, DEVICEdetected1WAP.tr))

# Guard: `df.train[-x, ]` with an empty `x` would drop every row.
if (length(DEVICESrm.tr) > 0) {
  df.train <- df.train[-DEVICESrm.tr, ]
}
Ahora el DF de TRAINING solo contiene filas con capturas (dispositivos) que han detectado como mínimo 2 WAPs.
# For every validation capture, count the WAP columns holding a value other
# than 100 (100 is the value a WAP column holds when that WAP was not detected).
# The WAP block is column 1 up to the last column whose name contains "WAP".
# One vectorised pass replaces the row-by-row loop, which re-ran grep() and
# grew the result vectors on every iteration.
nWAPdetected.val <- rowSums(
  df.val[, seq_len(max(grep(pattern = "WAP", names(df.val)))), drop = FALSE] != 100
)

# Row labels (as numbers) of the captures with no WAP detected. The loop
# tested "row sum == 100 * number of WAP columns"; a zero count is the same
# condition as long as no reading exceeds 100.
# An empty result is numeric(0) rather than NULL.
DEVICEnotDetected.val <- as.numeric(rownames(df.val))[which(nWAPdetected.val == 0)]

# Row labels (as numbers) of the captures with exactly one WAP detected.
DEVICEdetected1WAP.val <- as.numeric(rownames(df.val))[which(nWAPdetected.val == 1)]
# Print the validation rows flagged as having no WAP detected.
DEVICEnotDetected.val
## NULL
# Print the validation rows flagged as having exactly one WAP detected.
DEVICEdetected1WAP.val
## [1] 1
# Remove the validation captures that detected exactly one WAP.
# Guard: `df.val[-x, ]` with an empty `x` would return zero rows instead of
# leaving the data frame untouched.
if (length(DEVICEdetected1WAP.val) > 0) {
  df.val <- df.val[-DEVICEdetected1WAP.val, ]
}
No hay ningún dispositivo en el VALIDATION que no haya sido detectado por ningún WAP (obviamente, ya que si no es detectado por WAPs no aparecerá en esta base de datos). Todos han sido detectados por al menos 1 WAP. Además, solo hay un dispositivo que haya sido detectado por únicamente 1 WAP, el cual también quitamos.
### CREATING TRAIN BUILDINGS - LOOP ###
# df.trainB0 = data.frame(matrix(nrow = 0, ncol = length(names(df.train))))
# df.trainB1 = data.frame(matrix(nrow = 0, ncol = length(names(df.train))))
# df.trainB2 = data.frame(matrix(nrow = 0, ncol = length(names(df.train))))
#
# colnames(df.trainB0) = names(df.train)
# colnames(df.trainB1) = names(df.train)
# colnames(df.trainB2) = names(df.train)
#
#
# for (i in 1:nrow(df.train)){
# if (df.train[i, "BUILDINGID"] == 0){
# df.trainB0 = rbind(df.trainB0, df.train[i,])
# } else if (df.train[i, "BUILDINGID"] == 1) {
# df.trainB1 = rbind(df.trainB1, df.train[i,])
# } else {
# df.trainB2 = rbind(df.trainB2, df.train[i,])
# }
# }
### CREATING TRAIN BUILDINGS - FILTER ###
# One training subset per building, selected on BUILDINGID.
df.trainB0 <- filter(df.train, BUILDINGID == 0)
df.trainB1 <- filter(df.train, BUILDINGID == 1)
df.trainB2 <- filter(df.train, BUILDINGID == 2)
### CREATING VALIDATION BUILDINGS ###
# One validation subset per building, selected on BUILDINGID.
df.valB0 <- filter(df.val, BUILDINGID == 0)
df.valB1 <- filter(df.val, BUILDINGID == 1)
df.valB2 <- filter(df.val, BUILDINGID == 2)
# Axis settings reused by the plotly charts below: an empty title and every
# decoration (zero line, axis line, tick labels, grid) switched off.
a <- c(
  list(title = ""),
  setNames(
    rep(list(FALSE), 4),
    c("zeroline", "showline", "showticklabels", "showgrid")
  )
)
# Training captures on the longitude/latitude plane, coloured by building,
# with both axes styled by the settings in `a`.
layout(
  plot_ly(
    data = df.train,
    x = ~LONGITUDE,
    y = ~LATITUDE,
    color = ~factor(BUILDINGID)
  ),
  xaxis = a,
  yaxis = a
)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# 3D view of the training captures: longitude, latitude and floor, coloured by
# building. A 3D trace takes its axis settings from `layout(scene = ...)`;
# top-level `xaxis`/`yaxis` only affect 2D plots, so `a` is passed via `scene`.
plot_ly(
  data = df.train,
  x = ~LONGITUDE,
  y = ~LATITUDE,
  z = ~factor(FLOOR),
  color = ~factor(BUILDINGID)
) %>%
  layout(scene = list(xaxis = a, yaxis = a))
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# 3D view of the building 0 training captures, coloured by floor.
# A 3D trace takes its axis settings from `layout(scene = ...)`; top-level
# `xaxis`/`yaxis` only affect 2D plots, so `a` is passed via `scene`.
plot_ly(
  data = df.trainB0,
  x = ~LONGITUDE,
  y = ~LATITUDE,
  z = ~factor(FLOOR),
  color = ~factor(FLOOR)
) %>%
  layout(
    title = "Building 0",
    scene = list(xaxis = a, yaxis = a)
  )
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# 3D view of the building 1 training captures, coloured by floor.
# A 3D trace takes its axis settings from `layout(scene = ...)`; top-level
# `xaxis`/`yaxis` only affect 2D plots, so `a` is passed via `scene`.
plot_ly(
  data = df.trainB1,
  x = ~LONGITUDE,
  y = ~LATITUDE,
  z = ~factor(FLOOR),
  color = ~factor(FLOOR)
) %>%
  layout(
    title = "Building 1",
    scene = list(xaxis = a, yaxis = a)
  )
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# 3D view of the building 2 training captures, coloured by floor.
# A 3D trace takes its axis settings from `layout(scene = ...)`; top-level
# `xaxis`/`yaxis` only affect 2D plots, so `a` is passed via `scene`.
plot_ly(
  data = df.trainB2,
  x = ~LONGITUDE,
  y = ~LATITUDE,
  z = ~factor(FLOOR),
  color = ~factor(FLOOR)
) %>%
  layout(
    title = "Building 2",
    scene = list(xaxis = a, yaxis = a)
  )
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
# Validation captures on the longitude/latitude plane, coloured by building.
ggplot(df.val) +
  geom_point(aes(x = LONGITUDE, y = LATITUDE, color = as.factor(BUILDINGID)))

# The same view for each building on its own, drawn in a single fixed colour.
ggplot(df.valB0) +
  geom_point(aes(x = LONGITUDE, y = LATITUDE), color = "Red")
ggplot(df.valB1) +
  geom_point(aes(x = LONGITUDE, y = LATITUDE), color = "DarkGreen")
ggplot(df.valB2) +
  geom_point(aes(x = LONGITUDE, y = LATITUDE), color = "Blue")
Parece que la distribución de las observaciones del validation es buena, ya que reproduce la forma del edificio como tal. Aunque estos son los valores reales o conocidos, lo interesante será hacer el mismo plot de la distribución de las observaciones cuando hayamos hecho la predicción sobre el building, floor…
# Frequency of each WAP001 value among the building 0 training captures.
table(df.trainB0[["WAP001"]])
##
## -97 -96 -95 -94 -93 100
## 2 8 3 4 1 5230
# Frequency of each WAP001 value among the building 1 training captures.
table(df.trainB1[["WAP001"]])
##
## 100
## 5151
# Frequency of each WAP001 value among the building 2 training captures.
table(df.trainB2[["WAP001"]])
##
## 100
## 9453
# Frequency of each WAP001 value among the building 0 validation captures.
table(df.valB0[["WAP001"]])
##
## -94 -93 -92 -85 100
## 1 1 4 2 528
# Frequency of each WAP001 value among the building 1 validation captures.
table(df.valB1[["WAP001"]])
##
## 100
## 306
# Frequency of each WAP001 value among the building 2 validation captures.
table(df.valB2[["WAP001"]])
##
## 100
## 268
Esto indica que podemos eliminar los datos del WAP001 de los edificios 1 y 2. Sabemos que ese WAP pertenece al edificio 0 y que no es detectado por ningún dispositivo en otro edificio.